/*   */
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include <unistd.h>
#include<string.h>
#include <sys/time.h>
//#include <ctime>
//#include<gsl/gsl_rng.h>

//#define LENGTH 1073741824
#define LENGTH 33554432
#define BITS_PER_BYTE 32 // number of spins squashed into a byte
#define MAXTHREADS 512   // number of threads in a block, maximally
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)

//Type Definitions
typedef int EnergyType;//O(N^2)
#if BITS_PER_BYTE > 32
typedef unsigned long long int BondType;//O(1)
#else
typedef unsigned int BondType;//O(1)
#endif
typedef short int SpinType;//O(1)
typedef int FitType;//O(N)
typedef unsigned int Nat;//needs N<2^16
typedef unsigned long long int LongNat;//needs N<2^16


//global variables
short int pdf[LENGTH];
int debug;

/*
 inputs an integer n and converts it into a sequence of characters in *str,
 starting at str[i] and returning the integer i where "n" ends in *str
 (str[i]='\0').
 */
int inttostring(int n,int i,char *str){
    if(n < 0){
        str[i++] = '-';
        i = inttostring(-n,i,str);
    }else{
        if((n/10) > 0) i = inttostring(n/10,i,str);
        str[i++] = '0' + n%10;
        str[i] = '\0';
    }
    return(i);
}

__host__
__device__
void printarray(BondType *array,Nat n){
    Nat i,j;
    BondType xxx;
    
    for(i=0; i<n; i++){
        xxx = array[i];
        for(j=0; j<BITS_PER_BYTE; j++){
            printf("%1d",(int)(xxx%2));
            xxx /= 2;
        }
        printf(" ");
    }
    printf("\n");
}

LongNat encode(int i,int j,int k,LongNat L1,LongNat L2){// needed for coalescence!
    return(i*L2+j*L1+k);
}

/***************************************************
 This puts the bond matrix Jij into a linear form, Jlist, that optimizes coalescence later
 ***************************************************************/
EnergyType initInstance(BondType *Jlist,
                        Nat nv,
                        LongNat L1,
                        LongNat L2,
                        LongNat L3){
    int i,j,k,thisbit;
    BondType setbit[BITS_PER_BYTE];
    LongNat element,thisbyte;
    EnergyType balance=0;
    
    setbit[0] = (BondType)1;
    for(i=1; i<BITS_PER_BYTE; i++) setbit[i] = setbit[i-1] << 1;
    
    for(thisbyte=0; thisbyte<L3/BITS_PER_BYTE; thisbyte++) Jlist[thisbyte] = 0;//padding
    for(i=0; i<nv; i++){
        for(j=0; j<i; j++){
            for(k=0; k<j; k++){
                if(drand48() > 0.5) {//set +1 as 1 and leave -1 as 0
                    element = encode(i,j,k,L1,L2);
                    thisbyte = element/BITS_PER_BYTE;
                    thisbit = element%BITS_PER_BYTE;
                    Jlist[thisbyte] += setbit[thisbit];
                    element = encode(k,i,j,L1,L2);
                    thisbyte = element/BITS_PER_BYTE;
                    thisbit = element%BITS_PER_BYTE;
                    Jlist[thisbyte] += setbit[thisbit];
                    element = encode(j,k,i,L1,L2);
                    thisbyte = element/BITS_PER_BYTE;
                    thisbit = element%BITS_PER_BYTE;
                    Jlist[thisbyte] += setbit[thisbit];
                    element = encode(k,j,i,L1,L2);
                    thisbyte = element/BITS_PER_BYTE;
                    thisbit = element%BITS_PER_BYTE;
                    Jlist[thisbyte] += setbit[thisbit];
                    element = encode(j,i,k,L1,L2);
                    thisbyte = element/BITS_PER_BYTE;
                    thisbit = element%BITS_PER_BYTE;
                    Jlist[thisbyte] += setbit[thisbit];
                    element = encode(i,k,j,L1,L2);
                    thisbyte = element/BITS_PER_BYTE;
                    thisbit = element%BITS_PER_BYTE;
                    Jlist[thisbyte] += setbit[thisbit];
                    ++balance;
                }else{
                    --balance;
                }
            }
        }
    }
    //    for(j=0; j<nv; j++){
    //        for(i=0; i<nv; i++){
    //            printf("%5d %5d  %20llu:",j,i,Jlist[LongNat(j*L2+i*L1+BITS_PER_BYTE-1)/BITS_PER_BYTE]);
    //            printarray(&Jlist[LongNat(j*L2+i*L1+BITS_PER_BYTE-1)/BITS_PER_BYTE],(nv+BITS_PER_BYTE-1)/BITS_PER_BYTE);
    //        }
    //        printf("\n");
    //    }
    return(balance);
}


__global__
void ThisEnergy(BondType *Jlist,
                BondType *spinByte,
                EnergyType *energy,
                Nat nv){//independent calculation of E
    __shared__ EnergyType r[MAXTHREADS];//needed for efficient reduction
    unsigned int blockSize = blockDim.x;
    int i,j,c;
    SpinType thisrowspin;
    BondType Bytefitness,thisspinByte,thatspinByte,thisbondByte;
    
    r[threadIdx.x] = 0;
    for(j=0; j<nv; j++){
        if( (j%BITS_PER_BYTE)==0 ){
            thatspinByte = spinByte[j/BITS_PER_BYTE];
        }
        thisrowspin = 2*(thatspinByte%2) - 1;
        i=threadIdx.x;
        while(i < nv){
            thisbondByte = Jlist[j*nv+i];
            thisspinByte = spinByte[i];
            Bytefitness = thisspinByte ^ thisbondByte;// = -s_i J_ij
            for (c = 0; Bytefitness; c++) Bytefitness &= Bytefitness - 1;
            r[threadIdx.x] += thisrowspin*(BITS_PER_BYTE-2*c);
            i += blockSize;
        }
        thatspinByte /= 2;
    }
    __syncthreads();
    for (j=blockSize/2; j>0; j>>=1) {//reduction
        if (threadIdx.x < j) {
            r[threadIdx.x] += r[threadIdx.x + j];
        }
        __syncthreads();
    }
    if(threadIdx.x == 0){
        *energy = (r[0]+nv)/2;
    }
}



/******************************************************************/
/*                                                                */
/*      tau-probability distribution                              */
/*                                                                */
/*  This provides a discrete approximation for P(k)~k^{-tau},     */
/* 1 <= k <= nv. It maps the distribution onto a large array where*/
/* entries = k occur with frequency in occordance with P(k).      */
/* Shortcoming: if "LENGTH" too small, and/or "tau" too large,    */
/* there will be only values of k<n_0<nv for some cutoff n_0, ie. */
/* certain ranks can't be reached!                                */
/*                                                                */
/******************************************************************/


void initrank(double tau,Nat nv){
    int i;
    double a,b;
    a = (1-pow(nv+1,1-tau))/LENGTH;
    b = 1/(1-tau);
    for(i = 0; i<LENGTH; i++){
        pdf[i] = (int) pow(1-i*a,b);
    }
}


/* taupick: rank is selected according to the tau-probability distribution.
 */
int taupick(void){
    return(pdf[(int)(LENGTH*drand48())]);
}

/*********************** END tau probability **********************/


__global__
void initspinByte(BondType *spinByte,
                  int *magnit,
                  Nat RandomNat,
                  Nat nv){
    unsigned int blockSize = blockDim.x;
    unsigned int index = blockIdx.x*blockSize + threadIdx.x;
    BondType setbit;
    unsigned int indexmod=index%BITS_PER_BYTE;
    unsigned int indexdiv=index/BITS_PER_BYTE;
    
    if(index < nv){
        setbit = (RandomNat/(index+1))%2;
        setbit ? atomicAdd(magnit,+1) : atomicAdd(magnit,-1);
        setbit <<= indexmod;
        atomicXor(&spinByte[indexdiv],setbit);
    }
    
    //     __syncthreads();
    //    if(index == 0){
    //        printf("spins:");
    //        printarray(spinByte,(nv+BITS_PER_BYTE-1)/BITS_PER_BYTE);
    //    }
}


/********************************************************************
 Simple, one-block implementation of reduction to get the current energy. If nv>MAXTHREADS,
 first loop-sum elements in each thread, then reduce threads.
 Here, everything must be padded to have powers-of-2!
 ********************************************************************/
__global__
void calculateEnergy(FitType *fitness,
                     EnergyType *energy,
                     Nat nv){
    __shared__ EnergyType r[MAXTHREADS];//needed for efficient reduction
    unsigned int blockSize = blockDim.x;
    int i,j;
    EnergyType thisr=0;
    
    i = threadIdx.x;
    while(i < nv){//loop-add to fit into a single block!
        thisr += fitness[i];//fitness=0 for i>=nv
        i += blockSize; //best for coalescence
    }
    r[threadIdx.x] = thisr;
    __syncthreads();
    for (j=blockSize/2; j>0; j>>=1) {//reduction
        if (threadIdx.x < j) {
            r[threadIdx.x] += r[threadIdx.x + j];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (threadIdx.x == 0){
        *energy = -r[0]/6;
    }
}

__device__
void warpReduce(volatile unsigned long long int *r, unsigned int tid,unsigned int blockSize) {
    if (blockSize >= 64) r[tid] += r[tid + 32];
    if (blockSize >= 32) r[tid] += r[tid + 16];
    if (blockSize >= 16) r[tid] += r[tid + 8];
    if (blockSize >= 8) r[tid] += r[tid + 4];
    if (blockSize >= 4) r[tid] += r[tid + 2];
    if (blockSize >= 2) r[tid] += r[tid + 1];
}

/*****  updatelocalH<<<nv,MAXTHREADS,BytesPerRow*sizeof(BondType)>>>(Jlist,spinByte,fitness,BytesPerRow,L2);   ****/
__global__
void updatelocalH(BondType *Jlist,//linear list of Jij bond matrix
                  BondType *spinByte,
                  FitType *localfield,
                  Nat BytesPerRow,
                  LongNat nv2){
    extern __shared__ BondType sB[];
    __shared__ int r[MAXTHREADS];//needed for efficient reduction
    unsigned int thisSpinIndex,blockSize = blockDim.x;
    unsigned int i,j,thisSpin,offset=blockIdx.x*nv2;
    int help,thisr=0;
    
    i = threadIdx.x;
    if(i < BytesPerRow) sB[i] = spinByte[i];
    __syncthreads();
    
    while(i < nv2){
#if BITS_PER_BYTE > 32
        help = __popcll(sB[i%BytesPerRow] ^ Jlist[offset + i]);//# of s_k J_ijk < 0
#else
        help = __popc(sB[i%BytesPerRow] ^ Jlist[offset + i]);
#endif
        thisSpinIndex = i/BytesPerRow;
        thisSpin = ( (sB[thisSpinIndex/BITS_PER_BYTE]) >> (thisSpinIndex%BITS_PER_BYTE) )%2;
        thisr += thisSpin ? +help : -help; //Is s_j s_k J_ijk < 0 ?
        i += blockSize;
    }
    //   atomicAdd(&localfield[blockIdx.x],thisr);
    
    r[threadIdx.x] = thisr;
    __syncthreads();
    /*
     
     if (blockSize >= 1024) {
     if (threadIdx.x < 512) { r[threadIdx.x] += r[threadIdx.x + 512]; }
     __syncthreads();
     }
     if (blockSize >= 512) {
     if (threadIdx.x < 256) { r[threadIdx.x] += r[threadIdx.x + 256]; }
     __syncthreads();
     }
     if (blockSize >= 256) {
     if (threadIdx.x < 128) { r[threadIdx.x] += r[threadIdx.x + 128]; }
     __syncthreads();
     }
     if (blockSize >= 128) {
     if (threadIdx.x < 64) { r[threadIdx.x] += r[threadIdx.x + 64]; }
     __syncthreads();
     }
     if (threadIdx.x < 32) warpReduce(r, threadIdx.x, blockSize);
     */
    
    for (j=blockSize/2; j>0; j>>=1) {//reduction
        if (threadIdx.x < j) {
            r[threadIdx.x] += r[threadIdx.x + j];
        }
        __syncthreads();
    }
    
    if (threadIdx.x == 0){
        localfield[blockIdx.x] = (FitType)r[0];  //provisional value of fitness!!!
        //       printf("localH: %5d %5d %5d\n",blockIdx.x,BytesPerRow,r[0]);
    }
}


/****  updatefitness<<<mingrid,MAXTHREADS>>>(spinByte,fitness,unstablespins,magnit,RandomNat,nv);  ****/
__global__
void updatefitness(BondType *spinByte,
                   FitType *fitness,
                   int *unstablespins,
                   int *magnit,
                   Nat RandomNat,
                   Nat nv){
    unsigned int blockSize = blockDim.x;
    unsigned int index = blockIdx.x*blockSize + threadIdx.x;
    unsigned int indexmod=index%BITS_PER_BYTE;
    unsigned int indexdiv=index/BITS_PER_BYTE;
    unsigned int thisspin;
    BondType setbit;
    FitType thisfitness=0;
    int oldmagnit=*magnit;
    
    //    if(threadIdx.x == 0){
    //        printf("M=%d\n",oldmagnit);
    //        printarray(spinByte,(nv+BITS_PER_BYTE-1)/BITS_PER_BYTE);
    //    }
    //    __syncthreads();
    
    setbit = (BondType)1 << indexmod;
    if(index < nv){
        thisfitness = (oldmagnit+1)*nv - 2 - 2*fitness[index];//since protofitness=sum_of_negative
        thisspin = setbit & spinByte[indexdiv];
        thisfitness = thisspin ? +thisfitness : -thisfitness;
        thisfitness += 2*oldmagnit;//makes up for J_iij = -1!!!
        //        printf("spin: %5d %5d \n",index,thisfitness);
        if(thisfitness < 0){
            atomicAdd(unstablespins,1);
            //if( (RandomNat/(index+1))%4 ){//flip with 75%
            if( (RandomNat/(index+1))%8 ){//flip with 88%
                atomicXor(&spinByte[indexdiv],setbit);
                thisspin ? atomicAdd(magnit,-2) : atomicAdd(magnit,+2);
                //                              printf("flip: %5d %5d %15llu\n",index,thisfitness,spinByte[indexdiv]);
            }
        }
    }
    fitness[index] = thisfitness;//final fitness
    //       if(index < nv)printf("uf: %10d %10d %10d %10d\n",index,thisfitness,*unstablespins,*magnit);
}

/********************************************************************
 This should run only rarely
 ********************************************************************/

__global__
void destablize(BondType *spinByte,
                FitType *fitness,
                int *unstablespins,
                int *threshold,
                int *modulator,
                int *magnit,
                Nat RandomNat,
                Nat nv){
    unsigned int blockSize = blockDim.x;
    unsigned int index = blockIdx.x*blockSize + threadIdx.x;
    unsigned int indexmod=index%BITS_PER_BYTE;
    unsigned int indexdiv=index/BITS_PER_BYTE;
    BondType setbit;
    int thisfitness,thisspin;
    
    //    int oldmagnit=*magnit;
    //
    //     if(index == 0){
    //     printf("spins in: M=%d   ",oldmagnit);
    //     printarray(spinByte,(nv+BITS_PER_BYTE-1)/BITS_PER_BYTE);
    //     }
    //     __syncthreads();
    
    if(index < nv){
        thisfitness = (int)fitness[index];
        if( !(thisfitness > *threshold) ){
            setbit = (BondType)1 << indexmod;
            thisspin = setbit & spinByte[indexdiv];
            if(thisfitness == *threshold){
                if( (RandomNat/(index+1))%(*modulator) == 0 ){
                    atomicAdd(unstablespins,1);
                    atomicXor(&spinByte[indexdiv],setbit);
                    thisspin ? atomicAdd(magnit,-2) : atomicAdd(magnit,+2);
                    //         printf("flip: %5d %5d %5d\n",index,thisfitness,*threshold);
                }
            }else{
                atomicAdd(unstablespins,1);
                atomicXor(&spinByte[indexdiv],setbit);
                thisspin ? atomicAdd(magnit,-2) : atomicAdd(magnit,+2);
                //      printf("flip: %5d %5d %5d\n",index,thisfitness,*threshold);
            }
        }
    }
    
    //     __syncthreads();
    //     if(index == 0){
    //     printf("spinsout: M=%d   ",*magnit);
    //     printarray(spinByte,(nv+BITS_PER_BYTE-1)/BITS_PER_BYTE);
    //     }
    
}


__global__
void makeHistogram(FitType *fitness,
                   int *histogram,
                   Nat nv){
    //    extern __shared__ short int r[];
    unsigned int blockSize = blockDim.x;
    unsigned int j,index=threadIdx.x;
    FitType thisfitness;
    
    while(index < nv){
        //       r[index] = 0;
        for(j=0; j<nv; j++) histogram[index*nv+j] = 0;
        index += blockSize;
    }
    __syncthreads();
    index = threadIdx.x;
    while(index < nv){
        thisfitness = fitness[index];
        if(thisfitness < 0){
            printf("Negative fitness should not happen with histogram!");
        }else{
            //   atomicAdd(&r[thisfitness],1);
            atomicAdd(&histogram[thisfitness],1);
        }
        index += blockSize;
    }
    //
    //     __syncthreads();
    //    if(threadIdx.x == 0){
    //        for(index=0; index < nv*nv; index++){
    //            //       histogram[index] = r[index];
    //            if(histogram[index])printf("P----> %6d %10d\n",(int)(index+2)/4-1,(int)histogram[index]);
    //        }
    //        printf("===========================================\n");
    //     }
    //
}

__global__
void makeThreshold(int *histogram,
                   int *threshold,
                   int *modulator,
                   int taupick){
    int j,thisunstable;
    float overage;
    int thisthreshold=0;
    
    thisunstable = histogram[0];
    //  printf("set threshold: %6d %6d %6d\n",taupick,thisthreshold,thisunstable);
    while(thisunstable < taupick){
        thisunstable += histogram[++thisthreshold];
        //       printf("set threshold: %6d %6d %6d\n",taupick,thisthreshold,thisunstable);
    }
    overage = 1.0 - (float)(thisunstable - taupick)/histogram[thisthreshold];
    j = 1;
    while(overage < (j+0.5)/j/(j+1.0)){
        ++j;
        //      printf("modulator: %f %d\n",overage,j);
    }
    *modulator = j;
    *threshold = thisthreshold;
}


void opennewfile(char *fname,int nv,float tau,int seed,int maxruns,int runpow,float runfac){
    int i=0;
    FILE *fp;
    
    i = inttostring(nv,i,fname);
    fname[i++] = '/';
    fname[i++] = 'V';
    fname[i++] = 'E';
    fname[i++] = 'O';
    fname[i++] = 'g';
    fname[i++] = 'p';
    fname[i++] = 'u';
    fname[i++] = '1';
    fname[i++] = '3';
    fname[i++] = '_';
    i = inttostring((int)(100*tau+0.5),i,fname);
    fname[i++] = '_';
    i = inttostring(seed,i,fname);
    fp = fopen(fname,"w");
    fprintf(fp,"#  nnv  tau       seed\n");
    fprintf(fp,"#%7d\t %5.2f\t %10d\t %5d\t %5.2f\t %5d\n",nv,tau,seed,runpow,runfac,maxruns);
    fclose(fp);
}

void openhistofile(char *fname,int *besthistogram,int nv,float tau,int seed){
    int i=0;
    
    i = inttostring(nv,i,fname);
    fname[i++] = '/';
    fname[i++] = 'V';
    fname[i++] = 'E';
    fname[i++] = 'O';
    fname[i++] = 'g';
    fname[i++] = 'p';
    fname[i++] = 'u';
    fname[i++] = '1';
    fname[i++] = '3';
    fname[i++] = 'h';
    fname[i++] = 'i';
    fname[i++] = 's';
    fname[i++] = 't';
    fname[i++] = 'o';
    fname[i++] = '_';
    i = inttostring((int)(100*tau+0.5),i,fname);
    fname[i++] = '_';
    i = inttostring(seed,i,fname);
    for(i=0; i<nv*nv; i++) besthistogram[i] = 0;
}

void out_run(char *fname,int nv,int inst,EnergyType minenergy,float minsweep,float minflips,int runtime,EnergyType balance,int bestmagnit){
    FILE *fp;
    
    fp = fopen(fname,"a");
    fprintf(fp,"%7d %10d %15.5lf %15.2g %15.2g %5d %15.5lf %7d\n",nv,inst,(double)minenergy,minsweep,minflips,runtime,(double)balance,bestmagnit);
    fclose(fp);
}

void out_histo(char *fname,int *histo,int nv,int inst){
    FILE *fp;
    int i;
    fp = fopen(fname,"w");
    for(i=0; i<nv*nv; i++){
        if(histo[i]) fprintf(fp,"%10d %15d %10d %10d\n",i,histo[i],nv,inst);
    }
    fclose(fp);
}



void commandlineparse(int argc, char **argv,Nat *nv,int *maxinst,int *maxruns,float *p,float *tau,int *seed,int *runpow,float *runfac,int *thisGPU){
    int i;
    char outfile[100];
    char mkdir[100];
    FILE *fp;
    
    *seed = -1;
    debug = 0;
    *thisGPU = 0;
    for (i = 1; i < argc; i++){  //Start at i = 1 to skip the command name.
        if (argv[i][0] == '-'){
            switch (argv[i][1]){
                case 'G':       *thisGPU = atoi(argv[++i]);
                    break;
                case 'n':       *nv = atoi(argv[++i]);
                    break;
                case 'd':       *p = 0.001 * atoi(argv[++i]);
                    break;
                case 'i':       *maxinst = atoi(argv[++i]);
                    break;
                case 'm':       *maxruns = atoi(argv[++i]);
                    break;
                case 't':       *tau = 0.01*(double)atoi(argv[++i]);
                    break;
                case 's':       *seed = atoi(argv[++i]);
                    break;
                case 'p':       *runpow = atoi(argv[++i]);
                    break;
                case 'r':       *runfac = 0.01*atoi(argv[++i]);
                    break;
                case 'D':       debug = atoi(argv[++i]);
                    break;
                default:
                    fprintf(stderr,"\nError:  Incorrect option %s\n",argv[i]);
                    fprintf(stderr,"\nAvailable options: \n\
       -G cudeSetDevice (Default=0)  \n\
       -n SystemSize  \n\
       -d average bond density*1000   \n\
       -p runpow:  SystemSize**nvpow ...  \n\
       -r runfac*100:  ... *runfac => total runtime  \n\
       -i instances:  set number of instances to run \n\
       -m maxruns within EO \n\
       -t tau*100: EO parameter tau\n\
       -s seed: random seed\n\
       -D debug info if >0 [default=0] \n\
       \n");
                    exit(0);
            }//switch
        }//if
    }//for
    //set up output directory and log-file
    inttostring(*nv,0,outfile);
    fp = fopen(strcat(outfile,"/README"), "a");
    if(fp == NULL){
        inttostring(*nv,0,outfile);
        mkdir[0] = '\0';
        strcat(mkdir,"mkdir  ");
        system(strcat(mkdir,outfile));
        fp = fopen(strcat(outfile,"/README"), "w");
        fprintf(fp,"tau,maxinstance,seed,maxruns,runpow,runfac\n");
    }
    fprintf(fp,"%6.4g %8d %8d %8d %4d %f\n",*tau,*maxinst,*seed,*maxruns,*runpow,*runfac);
    fclose(fp);
}



/*************************************************************/
/*                                                           */
/*                                                           */
/*                 Main Program                              */
/*                                                           */
/*                                                           */
/*                                                           */
/*************************************************************/

int main(int argc, char **argv){
    char histofile[100],outfile[100];
    BondType  *h_Jlist,*Jlist;
    BondType *spinByte;
    FitType *fitness;
    int *histogram,*h_besthisto,*besthistogram;
    double counter;
    int *unstablespins,h_unstablespins;
    Nat BytesPerRow;
    Nat nv,RandomNat;
    LongNat L1,L2,L3;
    int *threshold;
    int *modulator;
    int seed,maxinst,inst,runpow,j,mingrid,thisGPU;
    long long int sweep,maxsweep,flips;
    float minsweep,minflips,runfac,tau,p;
    int i,run,maxruns,runstop;
    EnergyType balance,minenergy;
    EnergyType h_energy, *energy;
    int *magnit,h_magnit,minmagnit;
    
    commandlineparse(argc,argv,&nv,&maxinst,&maxruns,&p,&tau,&seed,&runpow,&runfac,&thisGPU);
    cudaSetDevice(thisGPU);
    
    /***********************************************************************/
    /*   Each NxN bond matrix Jij is represented in binary form, with each i-th row squashed into */
    /*   BytesPerRow = N/BITS_PER_BYTE  many bytes, each byte holding BITS_PER_BYTE entries, where*/
    /*   "0"=-1  and "1"=+1.  */
    /***********************************************************************/
    
    BytesPerRow = (Nat)(nv+BITS_PER_BYTE-1)/BITS_PER_BYTE; //#bytes needed per row
    mingrid = (nv+MAXTHREADS-1)/MAXTHREADS;
    L1 = (LongNat)(BytesPerRow*BITS_PER_BYTE);
    L2 = (LongNat)(nv*L1);
    L3 = (LongNat)(nv*L2);
    
    //    printf("nv\t BytesPerRow\t MAXTHREADS\t mingrid\t\t L2\t\t\t L3\n");
    //    printf("%d\t %d\t\t %d\t\t %d\t %20.0lf\t %20.0lf\n",nv,BytesPerRow,MAXTHREADS,mingrid,(double)L2,(double)L3);
    //Memory for all arrays on host
    h_Jlist = (BondType *) malloc (sizeof(BondType) * L3/BITS_PER_BYTE);
    h_besthisto = (int *) malloc (sizeof(int) * nv*nv);
    besthistogram = (int *) malloc (sizeof(int) * nv*nv);

    //Memory for all arrays on device
    cudaMalloc((void**)&Jlist, sizeof(BondType) * L3/BITS_PER_BYTE);
    cudaMalloc((void**)&spinByte, sizeof(BondType) * BytesPerRow);
    cudaMalloc((void**)&fitness, sizeof(FitType) * nv);
    cudaMalloc((void**)&threshold, sizeof(int));
    cudaMalloc((void**)&histogram, sizeof(int) * nv*nv);
    cudaMalloc((void**)&modulator, sizeof(int));
    cudaMalloc((void**)&unstablespins, sizeof(int));
    cudaMalloc((void**)&magnit, sizeof(int));
    cudaMalloc((void**)&energy, sizeof(EnergyType));
    cudaCheckErrors("cudamalloc fail");
    
    maxsweep = (long long int)(runfac*pow(nv,runpow))+100;
    initrank(tau,nv);
    //    printf("%d %15.0lf %d\n",maxinst,(double)maxsweep,maxruns);
    opennewfile(outfile,nv,tau,seed,maxruns,runpow,runfac);
    openhistofile(histofile,besthistogram,nv,tau,seed);
    counter = 0;
    for(inst=0; inst<maxinst; inst++){
        srand48(seed*(inst+1));
        for(j = 0; j <100; j++) {
            drand48(); //Do some initial cycling on the random generator
        }
        balance = initInstance(h_Jlist,nv,L1,L2,L3);
        cudaMemcpy(Jlist, h_Jlist, sizeof(BondType) * L3/BITS_PER_BYTE,cudaMemcpyHostToDevice);
        //       cudaCheckErrors("cudamemcpy or cuda kernel fail");
        minenergy = 0;
        run = 0;
        runstop = maxruns;
        while(run < runstop){
            RandomNat = (Nat)(drand48() * 0.5 * (unsigned int)(-1));
            cudaMemset(spinByte,0,BytesPerRow * sizeof(BondType));//need to fill all with "0" for padding
            cudaMemset(magnit,0,sizeof(int));
            initspinByte<<<mingrid,MAXTHREADS>>>(spinByte,magnit,RandomNat,nv);
            flips = 0;
            for(sweep=0; sweep<maxsweep; ++sweep){
                cudaMemset(unstablespins,0,sizeof(int));
                RandomNat = (Nat)(drand48() * 0.5 * (unsigned int)(-1));
                updatelocalH<<<nv,MAXTHREADS,BytesPerRow*sizeof(BondType)>>>(Jlist,spinByte,fitness,BytesPerRow,L2/BITS_PER_BYTE);
                updatefitness<<<mingrid,MAXTHREADS>>>(spinByte,fitness,unstablespins,magnit,RandomNat,nv);
                cudaMemcpy(&h_unstablespins,unstablespins,sizeof(int),cudaMemcpyDeviceToHost );
                if(!h_unstablespins){
                    makeHistogram<<<1,MAXTHREADS>>>(fitness,histogram,nv);
                    cudaMemcpy(&h_magnit,magnit,sizeof(int),cudaMemcpyDeviceToHost );
                    calculateEnergy<<<1,MAXTHREADS>>>(fitness,energy,nv);
                    while(!h_unstablespins){
                        makeThreshold<<<1,1>>>(histogram,threshold,modulator,taupick());
                        RandomNat = (Nat)(drand48() * 0.5 * (unsigned int)(-1));
                        destablize<<<mingrid,MAXTHREADS>>>(spinByte,fitness,unstablespins,threshold,modulator,magnit,RandomNat,nv);
                        cudaMemcpy(&h_unstablespins,unstablespins,sizeof(int),cudaMemcpyDeviceToHost );
                    }
                    cudaMemcpy(&h_energy,energy,sizeof(EnergyType),cudaMemcpyDeviceToHost );
                    if(h_energy < minenergy){
                        minsweep = (float)sweep+(float)run*(float)maxsweep;
                        minflips = flips;
                        runstop = MAX(runstop,2*run+1);
                        minenergy = h_energy;
                        minmagnit = h_magnit;
                        cudaMemcpy(h_besthisto,histogram,sizeof(int) * nv*nv,cudaMemcpyDeviceToHost);
                    }//end if
                }
                flips += h_unstablespins;
            } //end for sweep
            ++run;
        }//end while run
        out_run(outfile,nv,inst,minenergy,minsweep,minflips,runstop,balance,minmagnit);
        for(i=0; i<nv*nv; i++) besthistogram[i] += h_besthisto[i];
        counter += minsweep;
        if(counter > 10000){
            counter = 0;
            out_histo(histofile,besthistogram,nv,inst);
        }
    } //end for inst
    out_histo(histofile,besthistogram,nv,inst);
    return(0);
}
